# -- encoding:utf-8 --
import re
import jieba
import csv
import random测试
import openpyxl

# Load the entity dictionary and register every entry with jieba
# (custom word + part-of-speech tag).
# The with-block closes the CSV handle once the dictionary has been consumed;
# newline='' is what the csv module asks for when it is given a file object.
with open("DICT_NOW.csv", 'r', encoding='utf8', newline='') as dict_file:
    dics = csv.reader(dict_file)
    for row in dics:
        # Only rows shaped exactly as (word, tag) are used; others are skipped.
        if len(row) == 2:
            word = row[0].strip()
            # jieba.add_word accepts upper-case tags, whereas
            # jieba.load_userdict requires custom tags to be lower-case.
            jieba.add_word(word, tag=row[1].strip())
            # NOTE(review): the return value is discarded and tune is left at
            # its default; per the jieba docs suggest_freq only adjusts the
            # dictionary when tune=True -- confirm whether that was intended.
            jieba.suggest_freq(word)

# Output files for the three dataset splits (opened in append mode below).
train = "./data/example.train"
dev = "./data/example.dev"
test = "./data/example.test"


def _pick_split(text_id):
    """Map a random draw to an output path: 16-100 train, 6-15 dev, 1-5 test.

    Returns None for a draw outside 1..100, in which case nothing is written
    (this mirrors the original explicit range checks).
    """
    if 15 < text_id <= 100:
        return train
    if 5 < text_id <= 15:
        return dev
    if 0 < text_id <= 5:
        return test
    return None


def _append_sample(path, text):
    """Append one sample to *path*: a "<char> O" line per character, then a blank line."""
    # Explicit UTF-8 so the Chinese text does not depend on the platform's
    # default encoding (the dictionary CSV is read as UTF-8 as well).
    with open(path, "a", encoding="utf8") as out:
        out.write("".join(ch + " O\n" for ch in text) + "\n")


rb = openpyxl.load_workbook('盘源训练数据3.xlsx')
# Workbook item access replaces the deprecated get_sheet_by_name().
sheet = rb['Sheet1']

# Rows 2..16 of column E hold the text to export (region names, per the
# original comment).
for h_id in range(2, 17):
    msg = sheet['E' + str(h_id)].value
    if not msg:
        continue
    # Keep the stripped result (the original discarded it), then drop the
    # remaining inner spaces.
    msg = msg.strip().replace(' ', '')
    if not msg:
        # Nothing left after cleaning; avoid emitting an empty sample.
        continue
    # NOTE(review): `random测试` is the module imported at the top of the file;
    # presumably it exposes randint like the stdlib `random` -- confirm.
    text_id = random测试.randint(1, 100)
    target = _pick_split(text_id)
    if target is not None:
        _append_sample(target, msg)